Import the required libraries and define ancillary functions


In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

from __future__ import division

from collections import defaultdict, namedtuple
import cPickle as pickle
from datetime import datetime, timedelta
from functools import partial
import inspect
import json
import os
import re
import sys

import numpy as np
import pandas as pd
import seaborn as sn
import sklearn as sl


Populating the interactive namespace from numpy and matplotlib

Load train data

Using pandas' read_csv with the defaults, apart from index_col; the preprocessed file is loaded here, with the raw train load left commented out.


In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed_all.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/mtambos/speedy/train.csv'
    PTRAIN_PATH = r'/media/mtambos/speedy/train_preprocessed_all.csv'
    TEST_PATH = r'/media/mtambos/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/mtambos/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/mtambos/speedy/big.txt'
#df_orig = pd.read_csv(TRAIN_PATH, index_col="ID")
df = pd.read_csv(PTRAIN_PATH, index_col="ID")
#df


/home/mtambos/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,23,158,174,177,178,180,184,187) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
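The DtypeWarning above means pandas inferred different dtypes for different chunks of those columns. A minimal sketch of one way to avoid it, assuming enough memory to parse the whole file in a single pass (passing an explicit dtype= mapping is the other option when the intended types are known):

# Read the file in a single chunk so dtype inference sees all rows at once.
df = pd.read_csv(PTRAIN_PATH, index_col="ID", low_memory=False)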

Define the bool and int column lists


In [25]:
bool_cols = ['VAR_0008', 'VAR_0009', 'VAR_0010',
             'VAR_0011', 'VAR_0012', 'VAR_0043',
             'VAR_0196', 'VAR_0226', 'VAR_0229',
             'VAR_0230', 'VAR_0232', 'VAR_0236',
             'VAR_0239']
bool_cols += [c for c in df.columns if df[c].dtype == bool]
bool_cols = list(set(bool_cols))

int_cols = ['VAR_0013', 'VAR_0045', 'VAR_0198',
            'VAR_0227', 'VAR_0231', 'VAR_0233',
            'VAR_0237', 'VAR_0241']
int_cols += [c for c in df.columns if df[c].dtype == int]
int_cols = list(set(int_cols))
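If the pandas version in use ships DataFrame.select_dtypes (0.14.1 and later), the dtype-based part of these lists can be collected more directly; the hand-picked VAR_* names are still needed for columns the CSV stored as strings. A sketch:

# Equivalent to the dtype comprehensions above.
bool_cols += df.select_dtypes(include=['bool']).columns.tolist()
int_cols += df.select_dtypes(include=['int64']).columns.tolist()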

Check whether the classes are skewed


In [5]:
neg_samples_count = len(df['target'][df['target']==0])
pos_samples_count = len(df['target'][df['target']==1])
print '%s negative samples; %.2f%% of total' % (neg_samples_count, neg_samples_count/len(df)*100)
print '%s positive samples; %.2f%% of total' % (pos_samples_count, pos_samples_count/len(df)*100)


111458 negative samples; 76.75% of total
33773 positive samples; 23.25% of total
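For reference, the same breakdown in one line, assuming target only takes the values 0 and 1:

# Class balance as percentages of the total.
print df['target'].value_counts(normalize=True) * 100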

Cast bool columns as bool


In [26]:
def clean_bools(bool_val):
    # Pass genuine bools through untouched.
    if isinstance(bool_val, bool):
        return bool_val
    # Note: bool() maps any non-empty string (including 'false') to True,
    # so this relies on the preprocessed columns already holding bool-like values.
    try:
        return bool(bool_val)
    except (TypeError, ValueError):
        return np.nan

df[bool_cols] = df[bool_cols].applymap(clean_bools)
# Caveat: astype(np.bool) silently turns any remaining NaN into True.
df[bool_cols] = df[bool_cols].astype(np.bool)
df[bool_cols]


Out[26]:
VAR_0232 VAR_0229 VAR_0239 VAR_0236 VAR_0226 VAR_0230 VAR_0008 VAR_0009 VAR_0043 VAR_0196 VAR_0011 VAR_0010 VAR_0012
ID
2 True False False True False False False False False False False False False
4 False False False True False False False False False False False False False
5 True False False True False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ...
290461 True False False True False False False False False False False False False
290463 True False False True False False False False False False False False False

145231 rows × 13 columns
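If the raw file stores these flags as the strings 'true'/'false' (rather than already-parsed bools), bool() alone would map both to True; a hypothetical cleaner with an explicit mapping would look like this:

# Hypothetical helper: map string flags explicitly; anything else -> NaN.
BOOL_MAP = {'true': True, 'false': False}

def clean_bools_str(val):
    if isinstance(val, bool):
        return val
    if isinstance(val, basestring):
        return BOOL_MAP.get(val.strip().lower(), np.nan)
    return np.nan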

Cast int columns as int


In [27]:
def clean_ints(int_val):
    # Pass genuine ints through untouched.
    if isinstance(int_val, int):
        return int_val
    try:
        return int(int_val)
    except (TypeError, ValueError):
        return np.nan

df[int_cols] = df[int_cols].applymap(clean_ints)
# Note: this cast raises if any NaNs survive the cleaning step, since
# NumPy integer arrays cannot represent NaN.
df[int_cols] = df[int_cols].astype(np.int)
df[int_cols]


Out[27]:
VAR_1597 VAR_1596 VAR_1595 VAR_1594 VAR_1593 VAR_1592 VAR_1591 VAR_1590 VAR_1599 VAR_1598 ... VAR_1166 VAR_1167 VAR_1164 VAR_1165 VAR_1724 VAR_1604 VAR_1666 VAR_1867 VAR_1866 VAR_1725
ID
2 97 97 97 97 1 97 97 0 97 0 ... 0 0 0 0 98 97 0 98 0 98
4 0 0 0 0 1 1 1 0 0 0 ... 0 0 0 0 98 0 0 98 0 98
5 0 0 0 0 1 1 1 0 0 1 ... 0 0 0 0 98 0 0 98 0 98
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
290461 0 0 0 0 1 1 1 0 0 1 ... 0 0 0 0 98 0 0 98 0 98
290463 0 0 0 0 1 1 1 0 0 2 ... 0 0 0 0 1 0 0 98 0 0

145231 rows × 624 columns
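On newer pandas versions (0.17 and later), the same clean-and-cast can be done without a Python-level applymap; a sketch, assuming anything unparseable should become NaN and that no NaNs remain before the final cast:

# Coerce anything unparseable to NaN, then cast.
for c in int_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df[int_cols] = df[int_cols].astype(np.int)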

Remove columns with zero standard deviation


In [28]:
X = df.std(skipna=True)
# Series.sort() sorts in place on this pandas version; newer releases use sort_values().
X.sort()

In [30]:
zero_std_cols = X[X == 0].index.tolist()
zero_std_cols


Out[30]:
[]

No zero-variance columns were found, so there is nothing to drop here.

Drop duplicate rows and columns

Drop duplicate rows


In [21]:
df.drop_duplicates(inplace=True)

Drop duplicate bool columns


In [32]:
cols_to_drop = set()
# Pairwise O(n^2) comparison of the bool columns.
for i in range(len(bool_cols) - 1):
    c1 = bool_cols[i]
    print "Checking duplicates of column " + c1
    for j in range(i+1, len(bool_cols)):
        c2 = bool_cols[j]
        # Two columns are duplicates if they agree on every row.
        if (df[c1] == df[c2]).all():
            cols_to_drop.add(c2)
    # Note: this prints the cumulative set found so far, not just c1's duplicates.
    print "Duplicates found: " + str(cols_to_drop)


Checking duplicates of column VAR_0232
Duplicates found: set([])
Checking duplicates of column VAR_0229
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0239
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0236
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0226
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0230
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0008
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0009
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0043
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0196
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0011
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
Checking duplicates of column VAR_0010
Duplicates found: set(['VAR_0239', 'VAR_0008', 'VAR_0009', 'VAR_0043', 'VAR_0196', 'VAR_0011', 'VAR_0010', 'VAR_0012'])
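The same duplicates can be found in one call by treating each column as a row of the transposed frame; a sketch, assuming the 13 bool columns fit comfortably in memory twice over:

# duplicated() marks every occurrence after the first identical column.
dup_mask = df[bool_cols].T.duplicated()
cols_to_drop = set(dup_mask[dup_mask].index)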

In [35]:
df = df.drop(cols_to_drop, axis=1)

In [42]:
for c in cols_to_drop:
    bool_cols.remove(c)

Drop duplicate int columns


In [38]:
cols_to_drop = set()
for i in range(len(int_cols) - 1):
    c1 = int_cols[i]
    for j in range(i+1, len(int_cols)):
        c2 = int_cols[j]
        if (df[c1] == df[c2]).all():
            cols_to_drop.add(c2)
print "Duplicates found: " + str(cols_to_drop)


Duplicates found: set([])

Impute missing bool and int values


In [53]:
nan_cols = df[bool_cols + int_cols].isnull().any()
nan_cols[nan_cols]


Out[53]:
Series([], dtype: bool)

So, no missing values to impute.
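Had any shown up, a simple fill (median for ints, most frequent value for bools) could be applied before the casts above; a hypothetical sketch, since nan_cols is empty here:

# Hypothetical imputation, only needed if nan_cols were non-empty.
for c in int_cols:
    df[c] = df[c].fillna(df[c].median())
for c in bool_cols:
    df[c] = df[c].fillna(df[c].mode()[0])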

Remove bool and int columns that contain only NaNs


In [54]:
nan_cols = df[bool_cols + int_cols].isnull().all()
nan_cols[nan_cols]


Out[54]:
Series([], dtype: bool)

No all-NaN columns either, so nothing to remove.

Save the preprocessed data to another CSV file


In [55]:
# Overwrite the preprocessed CSV that was loaded at the top of the notebook.
df.to_csv(PTRAIN_PATH)

In [56]:
with open('int_cols.pickle', 'wb') as fp:
    pickle.dump(int_cols, fp)

In [57]:
with open('bool_cols.pickle', 'wb') as fp:
    pickle.dump(bool_cols, fp)
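For later notebooks, the frame and the column lists can be reloaded like this (a sketch mirroring the saves above):

# Reload the frame and the column lists in a downstream notebook.
df = pd.read_csv(PTRAIN_PATH, index_col="ID")
with open('bool_cols.pickle', 'rb') as fp:
    bool_cols = pickle.load(fp)
with open('int_cols.pickle', 'rb') as fp:
    int_cols = pickle.load(fp)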